Purpose

This document summarizes Rick Gilmore’s analysis of the Jaccard index data.

The goal is to explore ways to generate an empirical “null” distribution of the Jaccard index data to compare it to the observed data.

Set-up

Note: I have set eval=FALSE for a series of chunks below that generate the permuted sorting and Jaccard index data. These take many minutes to run.

Analysis plan

  1. Start with each participant’s raw sorting data within each wallpaper group.
  2. Permute the exemplars each participant sorted into similar piles by randomizing the mapping between the actual exemplar and the permuted exemplar id.
  3. Recalculate the Jaccard indices using the analysis/make.jaccard.df.R function.
  4. Do 1-3 n times, where n is large, probably 1,000.
  5. Compare the mean Jaccard indices (by wallpaper group) from the permuted set to the values we observed empirically.

Preliminary work

Let’s build and test a permutation function for the raw sorting data.

Now, let’s generate multiple permuted CSVs.

# Write permuted copies of one wallpaper group's raw sorting data.
#
# Reads "analysis/data/<wp_group>-sorting.csv" and, for each permutation,
# shuffles the exemplar columns independently within every row, then writes
# the result to
# "analysis/data/permutation_analysis/sorting_csv/<wp_group>-sorting-perm-NNN.csv".
#
# Args:
#   wp_group:       Wallpaper group label used to build the file names.
#   n_permutations: Number of permuted CSV files to write; 0 writes nothing.
#
# Returns NULL invisibly; the function is called for the files it writes.
# Stops with an error when the input CSV does not exist.
generate_n_sorting_permutations <-
  function(wp_group = "P1",
           n_permutations = 5) {
    csv_in <- paste0("analysis/data/", wp_group, "-sorting.csv")
    if (!file.exists(csv_in)) {
      stop(paste0("`csv_in` not found: ", csv_in))
    }

    # Make sure the destination exists so write_csv() does not fail on a
    # fresh checkout.
    out_dir <- "analysis/data/permutation_analysis/sorting_csv/"
    if (!dir.exists(out_dir)) {
      dir.create(out_dir, recursive = TRUE)
    }

    df_in <- readr::read_csv(csv_in)

    # Columns 1, 2, 23 and 24 are dropped here and re-attached below as
    # Participant, Set, Set_size and Group; everything else is treated as an
    # exemplar column.
    # NOTE(review): this assumes the CSV column order matches -- confirm.
    df_exemplars <- df_in[, -c(1, 2, 23, 24)]
    out_m <- as.matrix(df_exemplars)
    n_exemplars <- ncol(out_m)

    for (p in seq_len(n_permutations)) {
      csv_out <-
        paste0(
          out_dir,
          wp_group,
          "-sorting-perm-",
          stringr::str_pad(p, 3, pad = "0"), ".csv"
        )

      # Shuffle each row on its own. `out_m` is updated in place, so every
      # permutation reshuffles the previous one rather than the raw data.
      for (r in seq_len(nrow(out_m))) {
        out_m[r, ] <- out_m[r, sample(n_exemplars)]
      }

      # cbind() with the character Participant column coerces the whole
      # matrix to character before it becomes a data frame.
      array_out <-
        as.data.frame(cbind(df_in$Participant, df_in$Set, out_m, df_in$Set_size, df_in$Group))

      # Restore the original column names.
      names(array_out) <-
        c("Participant",
          "Set",
          names(df_exemplars),
          "Set_size",
          "Group")

      readr::write_csv(array_out, csv_out)
    }
  }

Then we test it.

# Smoke test using the function's default arguments, spelled out.
generate_n_sorting_permutations(wp_group = "P1", n_permutations = 5)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   Participant = col_character(),
##   Group = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

Now, let’s confirm that we can calculate Jaccard indices from these data.

# Compute and save Jaccard-index CSVs for every permuted sorting CSV that
# belongs to one wallpaper group.
#
# Args:
#   wallpaper_group: Label the input file names must start with (followed by "-").
#   duplicates:      Accepted but not used anywhere in the body.
#   input_dir:       Directory searched for permuted sorting CSVs.
#   output_dir:      Passed to calculate_save_jaccard_df() as `jaccard_dir`.
#
# Returns the list produced by purrr::map(), one element per input file.
make_jaccard_csvs <- function(wallpaper_group = "P1",
                              duplicates = FALSE,
                              input_dir = 'analysis/data/permutation_analysis/sorting_csv/',
                              output_dir = 'analysis/data/permutation_analysis/jaccards/') {
  # The Jaccard helpers live in separate scripts.
  source("analysis/jaccard.data.R")
  source("analysis/jaccard.R")

  group_pattern <- paste0("^", wallpaper_group, "\\-")
  perm_csvs <- list.files(
    path = input_dir,
    pattern = group_pattern,
    full.names = TRUE
  )

  purrr::map(
    .x = perm_csvs,
    .f = calculate_save_jaccard_df,
    wallpaper_group = wallpaper_group,
    jaccard_dir = output_dir
  )
}

# Load one permuted sorting CSV, calculate its Jaccard indices, and either
# save them to a CSV or return them.
#
# Args:
#   this_csv:        Path to a permuted sorting CSV.
#   wallpaper_group: Label used as the prefix of the output file name.
#   save_output:     If TRUE, write the result to file; if FALSE, return it.
#   jaccard_dir:     Output directory. It is pasted directly in front of the
#                    file name, so it must end with a path separator.
#   vb:              If TRUE, emit a message naming the file being saved.
#
# Returns the value of readr::write_csv() when saving, otherwise the data
# frame produced by jaccard.data().
calculate_save_jaccard_df <- function(this_csv,
                                      wallpaper_group,
                                      save_output = TRUE,
                                      jaccard_dir = "analysis/data/permutation_analysis/jaccards/",
                                      vb = FALSE) {

  this_fn <- basename(this_csv)

  # Take the digit run directly before ".csv". An unanchored "[0-9]{3}" picks
  # the FIRST three digits in the name, which truncates ids >= 1000 and can
  # match digits inside the group label instead.
  this_perm_number <- stringr::str_extract(this_fn, "[0-9]+(?=\\.csv$)")
  if (is.na(this_perm_number)) {
    # Keep the legacy behaviour for file names that do not end in digits.
    this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
  }

  out_fn <-
    paste0(jaccard_dir,
           wallpaper_group,
           "-jaccard-",
           this_perm_number,
           ".csv")

  this_df <- readr::read_csv(this_csv)

  # jaccard.data() is not defined in this document.
  # NOTE(review): presumably it comes from analysis/jaccard.data.R -- confirm.
  jaccard_df <- jaccard.data(this_df)

  if (save_output) {
    if (vb) message(paste0('Saving ', out_fn))
    readr::write_csv(jaccard_df, out_fn)
  } else {
    jaccard_df
  }
}
# Confirm the Jaccard step runs on the permuted P1 files (the default group).
make_jaccard_csvs(wallpaper_group = "P1")

Generate data

P1

# Write 999 permuted sorting CSVs for P1, then a Jaccard CSV for each
# permuted P1 file found.
generate_n_sorting_permutations(wp_group = "P1", n_permutations = 999)
make_jaccard_csvs(wallpaper_group = "P1")

P31M

# Write 999 permuted sorting CSVs for P31M, then a Jaccard CSV for each
# permuted P31M file found.
generate_n_sorting_permutations(wp_group = "P31M", n_permutations = 999)
make_jaccard_csvs(wallpaper_group = "P31M")

P3M1

# Write 999 permuted sorting CSVs for P3M1, then a Jaccard CSV for each
# permuted P3M1 file found.
generate_n_sorting_permutations(wp_group = "P3M1", n_permutations = 999)
make_jaccard_csvs(wallpaper_group = "P3M1")

P6

# Write 999 permuted sorting CSVs for P6, then a Jaccard CSV for each
# permuted P6 file found.
generate_n_sorting_permutations(wp_group = "P6", n_permutations = 999)
make_jaccard_csvs(wallpaper_group = "P6")

P6M

# Write 999 permuted sorting CSVs for P6M, then a Jaccard CSV for each
# permuted P6M file found.
generate_n_sorting_permutations(wp_group = "P6M", n_permutations = 999)
make_jaccard_csvs(wallpaper_group = "P6M")

Analyze simulated results

Create helper functions

# Read one permuted Jaccard CSV and label its rows.
#
# Adds two columns:
#   exemplar_pair: the last three digits of Exemplar.Row and Exemplar.Col,
#                  joined by "-".
#   perm:          the permutation id taken from the file name.
#
# Args:
#   this_csv: Path to a Jaccard CSV whose name ends in "<digits>.csv".
#
# Returns the data frame read from `this_csv` with the two columns added.
make_perm_jaccard_df <- function(this_csv) {
  this_fn <- basename(this_csv)

  # Take the digit run directly before ".csv". An unanchored "[0-9]{3}" picks
  # the FIRST three digits in the name, which truncates ids >= 1000 and can
  # match digits inside the group label instead.
  this_perm_number <- stringr::str_extract(this_fn, "[0-9]+(?=\\.csv$)")
  if (is.na(this_perm_number)) {
    # Keep the legacy behaviour for file names that do not end in digits.
    this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
  }

  this_df <- readr::read_csv(this_csv)

  this_df <- this_df %>%
    dplyr::mutate(
      .,
      exemplar_pair = paste0(
        stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
        "-",
        stringr::str_extract(Exemplar.Col, "[0-9]{3}$")
        ),
        perm = this_perm_number
    )

  this_df
}
# Stack every permuted Jaccard CSV for one wallpaper group into a single
# data frame, and optionally save it.
#
# Args:
#   wp_group:   Label the input file names must start with (followed by "-").
#   input_dir:  Directory searched for per-permutation Jaccard CSVs.
#   save_csv:   If TRUE, write "<wp_group>-aggregate-perm-jaccard.csv" to
#               `output_dir`; if FALSE, return the data frame instead.
#   output_dir: Directory that receives the aggregate CSV.
#   vb:         If TRUE, emit a message naming the file being saved.
#
# Returns the value of readr::write_csv() when saving, otherwise the
# combined data frame.
make_aggregate_perm_jaccard_df <- function(wp_group = "P1",
                                           input_dir = "analysis/data/permutation_analysis/jaccards",
                                           save_csv = TRUE,
                                           output_dir = "analysis/data/permutation_analysis/aggregates",
                                           vb = TRUE) {
  group_pattern <- paste0("^", wp_group, "\\-")
  jaccard_csvs <- list.files(
    path = input_dir,
    pattern = group_pattern,
    full.names = TRUE
  )
  combined_df <- purrr::map_df(jaccard_csvs, make_perm_jaccard_df)

  # Nothing to write: hand the data back directly.
  if (!save_csv) {
    return(combined_df)
  }

  out_path <- file.path(output_dir, paste0(wp_group, "-aggregate-perm-jaccard.csv"))
  if (vb) {
    message(paste0("Saving ", out_path))
  }
  readr::write_csv(combined_df, out_path)
}

Empirical distributions of Jaccard indices by group and exemplar-pair

P1

Import the data.

# Build (and save) the aggregate P1 permutation table, then re-read the
# saved CSV.
P1_perm_df <- make_aggregate_perm_jaccard_df(wp_group = "P1")
P1_perm_df <- readr::read_csv(
  file = "analysis/data/permutation_analysis/aggregates/P1-aggregate-perm-jaccard.csv"
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

# Histogram (50 bins) of all permuted P1 Jaccard indices.
ggplot2::ggplot(P1_perm_df, ggplot2::aes(x = Jaccard)) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

# Mean and SD of the permuted Jaccard index for each P1 exemplar pair.
P1_perm_stats_df <- dplyr::summarize(
  dplyr::group_by(P1_perm_df, Group, exemplar_pair),
  jaccard_mean = mean(Jaccard),
  jaccard_sd = sd(Jaccard)
)
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

P31M

Import the data.

# Build (and save) the aggregate P31M permutation table, then re-read the
# saved CSV.
P31M_perm_df <- make_aggregate_perm_jaccard_df(wp_group = "P31M")
P31M_perm_df <- readr::read_csv(
  file = "analysis/data/permutation_analysis/aggregates/P31M-aggregate-perm-jaccard.csv"
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

# Histogram (50 bins) of all permuted P31M Jaccard indices.
ggplot2::ggplot(P31M_perm_df, ggplot2::aes(x = Jaccard)) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

# Mean and SD of the permuted Jaccard index for each P31M exemplar pair.
P31M_perm_stats_df <- dplyr::summarize(
  dplyr::group_by(P31M_perm_df, Group, exemplar_pair),
  jaccard_mean = mean(Jaccard),
  jaccard_sd = sd(Jaccard)
)
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

P3M1

Import the data.

# Build (and save) the aggregate P3M1 permutation table, then re-read the
# saved CSV.
P3M1_perm_df <- make_aggregate_perm_jaccard_df(wp_group = "P3M1")
P3M1_perm_df <- readr::read_csv(
  file = "analysis/data/permutation_analysis/aggregates/P3M1-aggregate-perm-jaccard.csv"
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

# Histogram (50 bins) of all permuted P3M1 Jaccard indices.
ggplot2::ggplot(P3M1_perm_df, ggplot2::aes(x = Jaccard)) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

# Mean and SD of the permuted Jaccard index for each P3M1 exemplar pair.
P3M1_perm_stats_df <- dplyr::summarize(
  dplyr::group_by(P3M1_perm_df, Group, exemplar_pair),
  jaccard_mean = mean(Jaccard),
  jaccard_sd = sd(Jaccard)
)
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

P6

Import the data.

# Build (and save) the aggregate P6 permutation table, then re-read the
# saved CSV.
P6_perm_df <- make_aggregate_perm_jaccard_df(wp_group = "P6")
P6_perm_df <- readr::read_csv(
  file = "analysis/data/permutation_analysis/aggregates/P6-aggregate-perm-jaccard.csv"
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

# Histogram (50 bins) of all permuted P6 Jaccard indices.
ggplot2::ggplot(P6_perm_df, ggplot2::aes(x = Jaccard)) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

# Mean and SD of the permuted Jaccard index for each P6 exemplar pair.
P6_perm_stats_df <- dplyr::summarize(
  dplyr::group_by(P6_perm_df, Group, exemplar_pair),
  jaccard_mean = mean(Jaccard),
  jaccard_sd = sd(Jaccard)
)
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

P6M

Import the data.

# Build (and save) the aggregate P6M permutation table, then re-read the
# saved CSV.
P6M_perm_df <- make_aggregate_perm_jaccard_df(wp_group = "P6M")
P6M_perm_df <- readr::read_csv(
  file = "analysis/data/permutation_analysis/aggregates/P6M-aggregate-perm-jaccard.csv"
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

# Histogram (50 bins) of all permuted P6M Jaccard indices.
ggplot2::ggplot(P6M_perm_df, ggplot2::aes(x = Jaccard)) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

# Mean and SD of the permuted Jaccard index for each P6M exemplar pair.
P6M_perm_stats_df <- dplyr::summarize(
  dplyr::group_by(P6M_perm_df, Group, exemplar_pair),
  jaccard_mean = mean(Jaccard),
  jaccard_sd = sd(Jaccard)
)
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

Aggregating across groups

# Stack the five per-group permutation tables into one data frame.
jaccard_perm_df <- do.call(
  rbind,
  list(P1_perm_df, P31M_perm_df, P3M1_perm_df, P6_perm_df, P6M_perm_df)
)

Visualization.

# Box plot of permuted Jaccard indices, one facet row per wallpaper group.
# geom_boxplot() has no `bins` parameter (it belongs to geom_histogram()),
# so it is not passed here.
jaccard_perm_df %>%
  ggplot(.) +
  aes(Jaccard, color = Group) +
  facet_grid(Group ~ .) +
  geom_boxplot()
## Warning: Ignoring unknown parameters: bins

# Box plot of permuted Jaccard indices, one facet row per wallpaper group.
# geom_boxplot() has no `bins` parameter (it belongs to geom_histogram()),
# so it is not passed here.
# NOTE(review): this chunk duplicates the previous plot -- confirm whether a
# histogram was intended instead.
jaccard_perm_df %>%
  ggplot(.) +
  aes(Jaccard, color = Group) +
  facet_grid(Group ~ .) +
  geom_boxplot()
## Warning: Ignoring unknown parameters: bins

# Violin plot of permuted Jaccard indices by wallpaper group.
ggplot(jaccard_perm_df, aes(x = Group, y = Jaccard)) +
  geom_violin()

These plots show that the differences in mean Jaccard indices across groups that appear in the participants’ data also appear in the permuted data. This makes sense since the participants detected regularities and sorted the exemplars into different numbers of sets. In permuting the exemplar identifiers within participants, we keep some of this structure.

Let’s try aggregating the by-exemplar statistics.

# Stack the per-group exemplar-pair statistics. The inputs are grouped
# tibbles (summarize() leaves them grouped by Group); dplyr::bind_rows()
# rebuilds the grouping for the combined rows, which base rbind() does not.
jaccard_perm_stats_df <- dplyr::bind_rows(
  P1_perm_stats_df,
  P31M_perm_stats_df,
  P3M1_perm_stats_df,
  P6_perm_stats_df,
  P6M_perm_stats_df
)

# Sort by group, exemplar_pair
jaccard_perm_stats_df <- jaccard_perm_stats_df %>%
  dplyr::arrange(Group, exemplar_pair)
# Distribution of per-pair permutation means, one facet row per group.
ggplot(jaccard_perm_stats_df, aes(x = jaccard_mean, fill = Group)) +
  geom_histogram() +
  facet_grid(Group ~ .) +
  ggtitle("Mean exemplar-pair Jaccard indices for permuted data")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Distribution of per-pair permutation SDs, one facet row per group.
ggplot(jaccard_perm_stats_df, aes(x = jaccard_sd, fill = Group)) +
  geom_histogram() +
  facet_grid(Group ~ .) +
  ggtitle("Standard deviation of exemplar-pair Jaccard indices for permuted data")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Compare observed Jaccard to empirical distribution

Load the observed data and clean it.

# Observed (non-permuted) Jaccard indices.
jaccard_observed_df <- readr::read_csv(
  file = "analysis/data/jaccard-no-duplicates.csv"
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character()
## )
# Add the exemplar_pair key (last three digits of each exemplar id, joined
# by "-") and sort by group and pair.
jaccard_observed_df <- jaccard_observed_df %>%
  dplyr::mutate(
    exemplar_pair = paste(
      stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
      stringr::str_extract(Exemplar.Col, "[0-9]{3}$"),
      sep = "-"
    )
  ) %>%
  dplyr::arrange(Group, exemplar_pair)

Now, merge the permuted data with the observed data.

# Attach each pair's observed Jaccard index to its permutation statistics,
# then reorder and rename the columns in one pipeline.
jaccard_merged_df <- jaccard_perm_stats_df %>%
  dplyr::left_join(jaccard_observed_df, by = c("Group", "exemplar_pair")) %>%
  # Rearrange columns for convenience
  dplyr::select(
    Group, exemplar_pair, Jaccard, jaccard_mean, jaccard_sd,
    Exemplar.Row, Exemplar.Col
  ) %>%
  # Rename variables for clarity
  dplyr::rename(
    group = Group,
    jaccard_obs = Jaccard,
    jaccard_emp_mean = jaccard_mean,
    jaccard_emp_sd = jaccard_sd,
    exemplar_row = Exemplar.Row,
    exemplar_col = Exemplar.Col
  )

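Calculate empirical z as \(z_{emp}=(J_{obs}-\mu_{J})/\sigma_{J}\) for each exemplar pair, where \(\mu_{J}\) and \(\sigma_{J}\) are the mean and standard deviation of that pair’s permuted Jaccard indices.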

# Standardize each observed Jaccard index against its permutation (null)
# distribution and compute the upper-tail probability.
jaccard_merged_df <- jaccard_merged_df %>%
  dplyr::mutate(
    z_emp = (jaccard_obs - jaccard_emp_mean) / jaccard_emp_sd,
    # z_emp is already a z-score, so its tail probability comes from the
    # standard normal. Passing the permutation mean and SD to pnorm() again
    # standardizes it twice and drives every p-value to ~0.
    p_z_emp = pnorm(z_emp, lower.tail = FALSE)
  )

Plot a histogram of z_emp.

# Distribution of empirical z-scores, one facet row per group.
ggplot(jaccard_merged_df, aes(x = z_emp, fill = group)) +
  geom_histogram() +
  facet_grid(group ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Curiously, P31M, P6, P6M and P3M1 have exemplar pairs whose observed Jaccard indices are substantially larger than the empirically derived reference (null) distribution even though the mean Jaccard indices for P1 are the largest.

Just for fun, let’s print the exemplar pairs whose z_emp exceeds some criterion \(z\) values.

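We’ll use two-tailed criterion values, but list only the pairs in the upper tail (observed Jaccard index larger than the permutation mean).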

\(z_{obs}>\) 3.8905919, \(p<.0001\)

# Critical z for a two-tailed p < .0001; keep the pairs whose z_emp exceeds
# it, sorted by group and then by descending permutation mean.
p_crit <- .0001
z_crit <- qnorm(p = 1 - p_crit / 2)

j_emp_p0001 <- dplyr::arrange(
  dplyr::filter(jaccard_merged_df, z_emp > z_crit),
  group,
  dplyr::desc(jaccard_emp_mean)
)

knitr::kable(j_emp_p0001)
group exemplar_pair jaccard_obs jaccard_emp_mean jaccard_emp_sd exemplar_row exemplar_col z_emp p_z_emp
P1 008-009 0.4347826 0.1997302 0.0523893 101008 101009 4.486650 0
P31M 006-016 0.3469388 0.1544913 0.0455907 115006 115016 4.221202 0
P31M 002-020 0.4347826 0.1541413 0.0445330 115002 115020 6.301875 0
P31M 004-009 0.4666667 0.1536301 0.0458028 115004 115009 6.834437 0
P31M 008-015 0.3469388 0.1534583 0.0454080 115008 115015 4.260934 0
P31M 014-020 0.5000000 0.1532609 0.0443117 115014 115020 7.824999 0
P31M 007-020 0.3469388 0.1530137 0.0472758 115007 115020 4.101998 0
P31M 002-007 0.6500000 0.1528091 0.0456546 115002 115007 10.890260 0
P31M 002-014 0.4347826 0.1523665 0.0464296 115002 115014 6.082672 0
P31M 007-014 0.3469388 0.1520057 0.0465605 115007 115014 4.186665 0
P31M 009-014 0.3469388 0.1501004 0.0442869 115009 115014 4.444622 0
P3M1 019-020 0.4042553 0.1436215 0.0475534 114019 114020 5.480869 0
P3M1 003-016 0.3469388 0.1431156 0.0447568 114003 114016 4.554010 0
P3M1 011-019 0.3750000 0.1415372 0.0445837 114011 114019 5.236508 0
P6 001-017 0.3200000 0.1384481 0.0436607 116001 116017 4.158245 0
P6 007-009 0.4666667 0.1380211 0.0441697 116007 116009 7.440517 0
P6 014-017 0.3200000 0.1376545 0.0451294 116014 116017 4.040509 0
P6 013-019 0.4888889 0.1373315 0.0433875 116013 116019 8.102725 0
P6 005-013 0.3333333 0.1365565 0.0442138 116005 116013 4.450579 0
P6 002-011 0.4042553 0.1363300 0.0431172 116002 116011 6.213881 0
P6 008-009 0.3541667 0.1361674 0.0434391 116008 116009 5.018501 0
P6 008-020 0.3265306 0.1359853 0.0425070 116008 116020 4.482680 0
P6 006-019 0.4666667 0.1356848 0.0441123 116006 116019 7.503171 0
P6 003-014 0.3265306 0.1354559 0.0438507 116003 116014 4.357394 0
P6 006-013 0.5581395 0.1348380 0.0450430 116006 116013 9.397711 0
P6M 008-010 0.3333333 0.1448769 0.0455945 117008 117010 4.133318 0
P6M 008-020 0.3541667 0.1437420 0.0410528 117008 117020 5.125709 0
P6M 001-011 0.3333333 0.1433422 0.0465964 117001 117011 4.077378 0
P6M 010-020 0.3829787 0.1432528 0.0453747 117010 117020 5.283256 0
# Number of pairs above the criterion in each group.
xtabs(~ group, data = j_emp_p0001)
## group
##   P1 P31M P3M1   P6  P6M 
##    1   10    3   11    4

\(z_{obs}>\) 3.2905267, \(p<.001\)

# Critical z for a two-tailed p < .001; keep the pairs whose z_emp exceeds
# it, sorted by group and then by descending permutation mean.
p_crit <- .001
z_crit <- qnorm(p = 1 - p_crit / 2)

j_emp_p001 <- dplyr::arrange(
  dplyr::filter(jaccard_merged_df, z_emp > z_crit),
  group,
  dplyr::desc(jaccard_emp_mean)
)

knitr::kable(j_emp_p001)
group exemplar_pair jaccard_obs jaccard_emp_mean jaccard_emp_sd exemplar_row exemplar_col z_emp p_z_emp
P1 016-020 0.3750000 0.2026614 0.0511486 101016 101020 3.369369 0
P1 010-016 0.4042553 0.2019869 0.0520852 101010 101016 3.883416 0
P1 007-019 0.3750000 0.2007621 0.0501171 101007 101019 3.476615 0
P1 008-009 0.4347826 0.1997302 0.0523893 101008 101009 4.486650 0
P31M 006-016 0.3469388 0.1544913 0.0455907 115006 115016 4.221202 0
P31M 002-020 0.4347826 0.1541413 0.0445330 115002 115020 6.301875 0
P31M 004-009 0.4666667 0.1536301 0.0458028 115004 115009 6.834437 0
P31M 008-015 0.3469388 0.1534583 0.0454080 115008 115015 4.260934 0
P31M 014-020 0.5000000 0.1532609 0.0443117 115014 115020 7.824999 0
P31M 007-020 0.3469388 0.1530137 0.0472758 115007 115020 4.101998 0
P31M 002-007 0.6500000 0.1528091 0.0456546 115002 115007 10.890260 0
P31M 002-014 0.4347826 0.1523665 0.0464296 115002 115014 6.082672 0
P31M 006-019 0.3200000 0.1522070 0.0470140 115006 115019 3.569003 0
P31M 007-014 0.3469388 0.1520057 0.0465605 115007 115014 4.186665 0
P31M 009-014 0.3469388 0.1501004 0.0442869 115009 115014 4.444622 0
P3M1 008-012 0.3200000 0.1467831 0.0481581 114008 114012 3.596836 0
P3M1 019-020 0.4042553 0.1436215 0.0475534 114019 114020 5.480869 0
P3M1 003-016 0.3469388 0.1431156 0.0447568 114003 114016 4.554010 0
P3M1 008-014 0.2941176 0.1420000 0.0448338 114008 114014 3.392920 0
P3M1 008-013 0.2941176 0.1418432 0.0454661 114008 114013 3.349187 0
P3M1 011-019 0.3750000 0.1415372 0.0445837 114011 114019 5.236508 0
P3M1 015-020 0.2941176 0.1410353 0.0429412 114015 114020 3.564931 0
P3M1 003-010 0.2941176 0.1406768 0.0447781 114003 114010 3.426691 0
P6 001-017 0.3200000 0.1384481 0.0436607 116001 116017 4.158245 0
P6 007-009 0.4666667 0.1380211 0.0441697 116007 116009 7.440517 0
P6 014-017 0.3200000 0.1376545 0.0451294 116014 116017 4.040509 0
P6 013-019 0.4888889 0.1373315 0.0433875 116013 116019 8.102725 0
P6 005-013 0.3333333 0.1365565 0.0442138 116005 116013 4.450579 0
P6 002-011 0.4042553 0.1363300 0.0431172 116002 116011 6.213881 0
P6 008-009 0.3541667 0.1361674 0.0434391 116008 116009 5.018501 0
P6 008-020 0.3265306 0.1359853 0.0425070 116008 116020 4.482680 0
P6 006-019 0.4666667 0.1356848 0.0441123 116006 116019 7.503171 0
P6 012-016 0.2941176 0.1356635 0.0426123 116012 116016 3.718506 0
P6 003-014 0.3265306 0.1354559 0.0438507 116003 116014 4.357394 0
P6 006-013 0.5581395 0.1348380 0.0450430 116006 116013 9.397711 0
P6M 001-009 0.3265306 0.1465073 0.0478002 117001 117009 3.766165 0
P6M 010-013 0.3061224 0.1455597 0.0449610 117010 117013 3.571153 0
P6M 003-018 0.3125000 0.1455053 0.0459697 117003 117018 3.632710 0
P6M 008-010 0.3333333 0.1448769 0.0455945 117008 117010 4.133318 0
P6M 013-016 0.3061224 0.1444453 0.0459377 117013 117016 3.519484 0
P6M 002-005 0.3000000 0.1440870 0.0464512 117002 117005 3.356491 0
P6M 008-020 0.3541667 0.1437420 0.0410528 117008 117020 5.125709 0
P6M 001-011 0.3333333 0.1433422 0.0465964 117001 117011 4.077378 0
P6M 010-020 0.3829787 0.1432528 0.0453747 117010 117020 5.283256 0
P6M 015-018 0.3125000 0.1423195 0.0447096 117015 117018 3.806353 0
# Number of pairs above the criterion in each group.
xtabs(~ group, data = j_emp_p001)
## group
##   P1 P31M P3M1   P6  P6M 
##    4   11    8   12   10

\(z_{obs}>\) 2.5758293, \(p<.01\)

# Critical z for a two-tailed p < .01; keep the pairs whose z_emp exceeds
# it, sorted by group and then by descending permutation mean.
p_crit <- .01
z_crit <- qnorm(p = 1 - p_crit / 2)

j_emp_p01 <- dplyr::arrange(
  dplyr::filter(jaccard_merged_df, z_emp > z_crit),
  group,
  dplyr::desc(jaccard_emp_mean)
)

knitr::kable(j_emp_p01)
group exemplar_pair jaccard_obs jaccard_emp_mean jaccard_emp_sd exemplar_row exemplar_col z_emp p_z_emp
P1 016-020 0.3750000 0.2026614 0.0511486 101016 101020 3.369369 0
P1 010-015 0.3469388 0.2025179 0.0526086 101010 101015 2.745197 0
P1 010-016 0.4042553 0.2019869 0.0520852 101010 101016 3.883416 0
P1 007-019 0.3750000 0.2007621 0.0501171 101007 101019 3.476615 0
P1 010-020 0.3469388 0.1999799 0.0512336 101010 101020 2.868407 0
P1 008-009 0.4347826 0.1997302 0.0523893 101008 101009 4.486650 0
P31M 006-016 0.3469388 0.1544913 0.0455907 115006 115016 4.221202 0
P31M 002-020 0.4347826 0.1541413 0.0445330 115002 115020 6.301875 0
P31M 004-009 0.4666667 0.1536301 0.0458028 115004 115009 6.834437 0
P31M 008-015 0.3469388 0.1534583 0.0454080 115008 115015 4.260934 0
P31M 014-020 0.5000000 0.1532609 0.0443117 115014 115020 7.824999 0
P31M 004-014 0.2941176 0.1530242 0.0461496 115004 115014 3.057303 0
P31M 007-020 0.3469388 0.1530137 0.0472758 115007 115020 4.101998 0
P31M 015-019 0.2941176 0.1529676 0.0473107 115015 115019 2.983468 0
P31M 002-007 0.6500000 0.1528091 0.0456546 115002 115007 10.890260 0
P31M 002-014 0.4347826 0.1523665 0.0464296 115002 115014 6.082672 0
P31M 006-019 0.3200000 0.1522070 0.0470140 115006 115019 3.569003 0
P31M 011-016 0.2941176 0.1520355 0.0482082 115011 115016 2.947260 0
P31M 007-014 0.3469388 0.1520057 0.0465605 115007 115014 4.186665 0
P31M 013-018 0.2941176 0.1514879 0.0469310 115013 115018 3.039140 0
P31M 010-013 0.2941176 0.1511875 0.0466110 115010 115013 3.066448 0
P31M 011-015 0.2692308 0.1504190 0.0458878 115011 115015 2.589182 0
P31M 009-014 0.3469388 0.1501004 0.0442869 115009 115014 4.444622 0
P3M1 008-012 0.3200000 0.1467831 0.0481581 114008 114012 3.596836 0
P3M1 011-020 0.2692308 0.1437663 0.0464568 114011 114020 2.700667 0
P3M1 019-020 0.4042553 0.1436215 0.0475534 114019 114020 5.480869 0
P3M1 003-016 0.3469388 0.1431156 0.0447568 114003 114016 4.554010 0
P3M1 008-014 0.2941176 0.1420000 0.0448338 114008 114014 3.392920 0
P3M1 008-013 0.2941176 0.1418432 0.0454661 114008 114013 3.349187 0
P3M1 003-017 0.2941176 0.1418300 0.0474815 114003 114017 3.207302 0
P3M1 012-017 0.2692308 0.1417132 0.0456949 114012 114017 2.790632 0
P3M1 011-019 0.3750000 0.1415372 0.0445837 114011 114019 5.236508 0
P3M1 013-017 0.2692308 0.1412909 0.0445063 114013 114017 2.874649 0
P3M1 015-020 0.2941176 0.1410353 0.0429412 114015 114020 3.564931 0
P3M1 003-010 0.2941176 0.1406768 0.0447781 114003 114010 3.426691 0
P6 001-017 0.3200000 0.1384481 0.0436607 116001 116017 4.158245 0
P6 007-009 0.4666667 0.1380211 0.0441697 116007 116009 7.440517 0
P6 001-018 0.2692308 0.1379620 0.0438256 116001 116018 2.995250 0
P6 017-018 0.2692308 0.1378323 0.0450538 116017 116018 2.916482 0
P6 014-017 0.3200000 0.1376545 0.0451294 116014 116017 4.040509 0
P6 015-018 0.2500000 0.1374907 0.0435567 116015 116018 2.583053 0
P6 013-019 0.4888889 0.1373315 0.0433875 116013 116019 8.102725 0
P6 007-008 0.2500000 0.1369494 0.0434299 116007 116008 2.603059 0
P6 005-013 0.3333333 0.1365565 0.0442138 116005 116013 4.450579 0
P6 002-011 0.4042553 0.1363300 0.0431172 116002 116011 6.213881 0
P6 008-009 0.3541667 0.1361674 0.0434391 116008 116009 5.018501 0
P6 008-020 0.3265306 0.1359853 0.0425070 116008 116020 4.482680 0
P6 006-019 0.4666667 0.1356848 0.0441123 116006 116019 7.503171 0
P6 012-016 0.2941176 0.1356635 0.0426123 116012 116016 3.718506 0
P6 004-010 0.2692308 0.1356585 0.0453429 116004 116010 2.945823 0
P6 003-014 0.3265306 0.1354559 0.0438507 116003 116014 4.357394 0
P6 006-013 0.5581395 0.1348380 0.0450430 116006 116013 9.397711 0
P6M 006-017 0.2857143 0.1469697 0.0447717 117006 117017 3.098934 0
P6M 001-009 0.3265306 0.1465073 0.0478002 117001 117009 3.766165 0
P6M 010-013 0.3061224 0.1455597 0.0449610 117010 117013 3.571153 0
P6M 003-018 0.3125000 0.1455053 0.0459697 117003 117018 3.632710 0
P6M 008-010 0.3333333 0.1448769 0.0455945 117008 117010 4.133318 0
P6M 013-016 0.3061224 0.1444453 0.0459377 117013 117016 3.519484 0
P6M 001-013 0.2800000 0.1441186 0.0475129 117001 117013 2.859881 0
P6M 002-005 0.3000000 0.1440870 0.0464512 117002 117005 3.356491 0
P6M 008-020 0.3541667 0.1437420 0.0410528 117008 117020 5.125709 0
P6M 010-019 0.2800000 0.1437261 0.0444244 117010 117019 3.067549 0
P6M 013-020 0.2745098 0.1435044 0.0457516 117013 117020 2.863408 0
P6M 001-011 0.3333333 0.1433422 0.0465964 117001 117011 4.077378 0
P6M 010-020 0.3829787 0.1432528 0.0453747 117010 117020 5.283256 0
P6M 008-013 0.2800000 0.1431326 0.0453443 117008 117013 3.018404 0
P6M 015-018 0.3125000 0.1423195 0.0447096 117015 117018 3.806353 0
# Number of pairs above the criterion in each group.
xtabs(~ group, data = j_emp_p01)
## group
##   P1 P31M P3M1   P6  P6M 
##    6   17   12   17   15

\(z_{obs}>\) 1.959964, \(p<.05\)

# Critical z for a two-tailed p < .05; keep the pairs whose z_emp exceeds
# it, sorted by group and then by descending permutation mean.
p_crit <- .05
z_crit <- qnorm(p = 1 - p_crit / 2)

j_emp_p05 <- dplyr::arrange(
  dplyr::filter(jaccard_merged_df, z_emp > z_crit),
  group,
  dplyr::desc(jaccard_emp_mean)
)

knitr::kable(j_emp_p05)
group exemplar_pair jaccard_obs jaccard_emp_mean jaccard_emp_sd exemplar_row exemplar_col z_emp p_z_emp
P1 016-020 0.3750000 0.2026614 0.0511486 101016 101020 3.369369 0
P1 010-015 0.3469388 0.2025179 0.0526086 101010 101015 2.745197 0
P1 002-011 0.3265306 0.2022794 0.0509974 101002 101011 2.436422 0
P1 004-006 0.3200000 0.2020502 0.0520419 101004 101006 2.266437 0
P1 016-017 0.3200000 0.2019904 0.0517065 101016 101017 2.282295 0
P1 010-016 0.4042553 0.2019869 0.0520852 101010 101016 3.883416 0
P1 019-020 0.3200000 0.2019041 0.0514376 101019 101020 2.295904 0
P1 003-015 0.3200000 0.2016781 0.0544723 101003 101015 2.172147 0
P1 001-015 0.3200000 0.2015624 0.0525580 101001 101015 2.253462 0
P1 007-012 0.3200000 0.2014882 0.0542545 101007 101012 2.184367 0
P1 007-015 0.3200000 0.2012488 0.0520508 101007 101015 2.281448 0
P1 007-019 0.3750000 0.2007621 0.0501171 101007 101019 3.476615 0
P1 010-020 0.3469388 0.1999799 0.0512336 101010 101020 2.868407 0
P1 008-009 0.4347826 0.1997302 0.0523893 101008 101009 4.486650 0
P31M 006-016 0.3469388 0.1544913 0.0455907 115006 115016 4.221202 0
P31M 009-020 0.2692308 0.1543772 0.0455129 115009 115020 2.523536 0
P31M 002-020 0.4347826 0.1541413 0.0445330 115002 115020 6.301875 0
P31M 016-017 0.2692308 0.1539553 0.0456523 115016 115017 2.525074 0
P31M 010-015 0.2452830 0.1537649 0.0460724 115010 115015 1.986398 0
P31M 004-009 0.4666667 0.1536301 0.0458028 115004 115009 6.834437 0
P31M 006-015 0.2692308 0.1535847 0.0466486 115006 115015 2.479090 0
P31M 008-015 0.3469388 0.1534583 0.0454080 115008 115015 4.260934 0
P31M 014-020 0.5000000 0.1532609 0.0443117 115014 115020 7.824999 0
P31M 004-014 0.2941176 0.1530242 0.0461496 115004 115014 3.057303 0
P31M 007-020 0.3469388 0.1530137 0.0472758 115007 115020 4.101998 0
P31M 015-019 0.2941176 0.1529676 0.0473107 115015 115019 2.983468 0
P31M 002-007 0.6500000 0.1528091 0.0456546 115002 115007 10.890260 0
P31M 004-020 0.2452830 0.1527105 0.0458227 115004 115020 2.020234 0
P31M 008-019 0.2692308 0.1524873 0.0456175 115008 115019 2.559180 0
P31M 003-005 0.2452830 0.1524094 0.0458820 115003 115005 2.024186 0
P31M 001-008 0.2452830 0.1523676 0.0459000 115001 115008 2.024300 0
P31M 002-014 0.4347826 0.1523665 0.0464296 115002 115014 6.082672 0
P31M 003-010 0.2452830 0.1523455 0.0468872 115003 115010 1.982150 0
P31M 010-016 0.2692308 0.1522625 0.0462861 115010 115016 2.527071 0
P31M 006-019 0.3200000 0.1522070 0.0470140 115006 115019 3.569003 0
P31M 008-012 0.2452830 0.1520735 0.0468120 115008 115012 1.991144 0
P31M 013-016 0.2692308 0.1520686 0.0464882 115013 115016 2.520255 0
P31M 011-016 0.2941176 0.1520355 0.0482082 115011 115016 2.947260 0
P31M 006-011 0.2452830 0.1520303 0.0471372 115006 115011 1.978325 0
P31M 007-014 0.3469388 0.1520057 0.0465605 115007 115014 4.186665 0
P31M 005-018 0.2452830 0.1514984 0.0465938 115005 115018 2.012812 0
P31M 013-018 0.2941176 0.1514879 0.0469310 115013 115018 3.039140 0
P31M 010-013 0.2941176 0.1511875 0.0466110 115010 115013 3.066448 0
P31M 002-009 0.2452830 0.1511269 0.0451101 115002 115009 2.087248 0
P31M 013-019 0.2692308 0.1508800 0.0464156 115013 115019 2.549807 0
P31M 010-011 0.2452830 0.1506905 0.0468336 115010 115011 2.019757 0
P31M 011-015 0.2692308 0.1504190 0.0458878 115011 115015 2.589182 0
P31M 009-014 0.3469388 0.1501004 0.0442869 115009 115014 4.444622 0
P3M1 008-012 0.3200000 0.1467831 0.0481581 114008 114012 3.596836 0
P3M1 003-018 0.2452830 0.1447031 0.0463087 114003 114018 2.171942 0
P3M1 006-009 0.2452830 0.1444126 0.0423229 114006 114009 2.383354 0
P3M1 004-019 0.2452830 0.1438403 0.0464733 114004 114019 2.182815 0
P3M1 011-020 0.2692308 0.1437663 0.0464568 114011 114020 2.700667 0
P3M1 019-020 0.4042553 0.1436215 0.0475534 114019 114020 5.480869 0
P3M1 003-016 0.3469388 0.1431156 0.0447568 114003 114016 4.554010 0
P3M1 016-018 0.2452830 0.1428453 0.0441724 114016 114018 2.319040 0
P3M1 005-007 0.2452830 0.1426746 0.0441964 114005 114007 2.321646 0
P3M1 008-014 0.2941176 0.1420000 0.0448338 114008 114014 3.392920 0
P3M1 008-013 0.2941176 0.1418432 0.0454661 114008 114013 3.349187 0
P3M1 003-017 0.2941176 0.1418300 0.0474815 114003 114017 3.207302 0
P3M1 002-012 0.2452830 0.1417180 0.0458268 114002 114012 2.259921 0
P3M1 012-017 0.2692308 0.1417132 0.0456949 114012 114017 2.790632 0
P3M1 007-015 0.2452830 0.1416145 0.0453829 114007 114015 2.284309 0
P3M1 011-019 0.3750000 0.1415372 0.0445837 114011 114019 5.236508 0
P3M1 013-017 0.2692308 0.1412909 0.0445063 114013 114017 2.874649 0
P3M1 015-020 0.2941176 0.1410353 0.0429412 114015 114020 3.564931 0
P3M1 004-011 0.2452830 0.1409725 0.0456179 114004 114011 2.286614 0
P3M1 003-010 0.2941176 0.1406768 0.0447781 114003 114010 3.426691 0
P3M1 010-011 0.2452830 0.1404196 0.0459566 114010 114011 2.281792 0
P6 001-017 0.3200000 0.1384481 0.0436607 116001 116017 4.158245 0
P6 007-009 0.4666667 0.1380211 0.0441697 116007 116009 7.440517 0
P6 001-018 0.2692308 0.1379620 0.0438256 116001 116018 2.995250 0
P6 017-018 0.2692308 0.1378323 0.0450538 116017 116018 2.916482 0
P6 014-017 0.3200000 0.1376545 0.0451294 116014 116017 4.040509 0
P6 005-007 0.2407407 0.1375631 0.0450893 116005 116007 2.288294 0
P6 015-018 0.2500000 0.1374907 0.0435567 116015 116018 2.583053 0
P6 013-019 0.4888889 0.1373315 0.0433875 116013 116019 8.102725 0
P6 005-019 0.2407407 0.1370936 0.0432307 116005 116019 2.397533 0
P6 014-018 0.2452830 0.1370532 0.0432897 116014 116018 2.500131 0
P6 010-012 0.2222222 0.1369815 0.0428191 116010 116012 1.990716 0
P6 004-020 0.2452830 0.1369799 0.0452552 116004 116020 2.393162 0
P6 007-008 0.2500000 0.1369494 0.0434299 116007 116008 2.603059 0
P6 005-013 0.3333333 0.1365565 0.0442138 116005 116013 4.450579 0
P6 002-011 0.4042553 0.1363300 0.0431172 116002 116011 6.213881 0
P6 017-019 0.2452830 0.1362403 0.0439990 116017 116019 2.478302 0
P6 008-009 0.3541667 0.1361674 0.0434391 116008 116009 5.018501 0
P6 008-020 0.3265306 0.1359853 0.0425070 116008 116020 4.482680 0
P6 006-019 0.4666667 0.1356848 0.0441123 116006 116019 7.503171 0
P6 012-016 0.2941176 0.1356635 0.0426123 116012 116016 3.718506 0
P6 004-010 0.2692308 0.1356585 0.0453429 116004 116010 2.945823 0
P6 004-012 0.2452830 0.1355501 0.0440347 116004 116012 2.491967 0
P6 003-014 0.3265306 0.1354559 0.0438507 116003 116014 4.357394 0
P6 006-013 0.5581395 0.1348380 0.0450430 116006 116013 9.397711 0
P6M 006-017 0.2857143 0.1469697 0.0447717 117006 117017 3.098934 0
P6M 001-009 0.3265306 0.1465073 0.0478002 117001 117009 3.766165 0
P6M 005-019 0.2500000 0.1459680 0.0447321 117005 117019 2.325667 0
P6M 008-009 0.2500000 0.1458103 0.0472835 117008 117009 2.203510 0
P6M 010-013 0.3061224 0.1455597 0.0449610 117010 117013 3.571153 0
P6M 003-018 0.3125000 0.1455053 0.0459697 117003 117018 3.632710 0
P6M 015-019 0.2549020 0.1452188 0.0454548 117015 117019 2.413016 0
P6M 014-017 0.2549020 0.1450481 0.0474809 117014 117017 2.313644 0
P6M 009-019 0.2500000 0.1449205 0.0471529 117009 117019 2.228485 0
P6M 008-010 0.3333333 0.1448769 0.0455945 117008 117010 4.133318 0
P6M 013-016 0.3061224 0.1444453 0.0459377 117013 117016 3.519484 0
P6M 005-017 0.2500000 0.1443246 0.0469896 117005 117017 2.248911 0
P6M 001-013 0.2800000 0.1441186 0.0475129 117001 117013 2.859881 0
P6M 002-005 0.3000000 0.1440870 0.0464512 117002 117005 3.356491 0
P6M 008-020 0.3541667 0.1437420 0.0410528 117008 117020 5.125709 0
P6M 010-019 0.2800000 0.1437261 0.0444244 117010 117019 3.067549 0
P6M 013-020 0.2745098 0.1435044 0.0457516 117013 117020 2.863408 0
P6M 001-011 0.3333333 0.1433422 0.0465964 117001 117011 4.077378 0
P6M 010-020 0.3829787 0.1432528 0.0453747 117010 117020 5.283256 0
P6M 008-013 0.2800000 0.1431326 0.0453443 117008 117013 3.018404 0
P6M 015-018 0.3125000 0.1423195 0.0447096 117015 117018 3.806353 0
# Number of pairs above the criterion in each group.
xtabs(~ group, data = j_emp_p05)
## group
##   P1 P31M P3M1   P6  P6M 
##   14   34   21   24   21

Histogram with annotations

# Histogram of empirical z-scores per group, with vertical reference lines
# at the two-tailed critical values for p = .0001 (black) and p = .01 (gray).
ggplot(jaccard_merged_df, aes(x = z_emp, fill = group)) +
  geom_histogram(bins = 20) +
  geom_vline(xintercept = qnorm(1 - .0001 / 2), color = "black") +
  geom_vline(xintercept = qnorm(1 - .01 / 2), color = "gray50") +
  facet_grid(group ~ .)